
global projectdir "~"
global datadir "$projectdir/data"
global intermed "$projectdir/data/intermediate_files"

set more off

******************************************************************************
******************************************************************************
* Hold CPI values in memory
* Payroll growth and wages ajusted for cpi
import delimited "$datadir/cpi_quarterly.csv", clear 
bysort year: egen cpi_2014=mean(cpa)
keep year cpi_2014
duplicates drop year, force
gen cpi_2014_mult=100/cpi

sort year

gen cpi_2014_mult_t3=cpi_2014_mult[_n+3]
gen cpi_2014_mult_t5=cpi_2014_mult[_n+5]

compress
tempfile cpi_2014
save `cpi_2014', replace

******************************************************************************
******************************************************************************
* Get PIKs from LEHD and W2 job history files, and combine into a full list, 
*   marking the source of each.
* Then create a balanced PIK-year panel.

cd $datadir/intermediate_files
use iris_lehd_jobhistory_2018q4, clear
keep pik
duplicates drop
tempfile lehd_piks
save `lehd_piks', replace

use iris_w2_jobhistory_2018q4, clear
keep if flag_emplid_w2 == 1
keep pik
duplicates drop
merge 1:1 pik using `lehd_piks'
gen flag_pik_source = ""
replace flag_pik_source = "LEHD and W2" if _merge == 3
replace flag_pik_source = "LEHD Only" if _merge == 2
replace flag_pik_source = "W2 Only" if _merge == 1
drop _merge
tab flag_pik_source

* Create a numeric PIK identifier for tsset below.
egen pik_num = group(pik)

* Create balanced panel
gen startyear = 2001
gen endyear = 2017
gen year_range = endyear - startyear + 1
expand year_range
by pik, sort: gen year = startyear + _n - 1

sort pik year
keep pik_num pik year flag_pik_source
order pik_num pik year flag_pik_source

tsset pik_num year

compress
save pik_year_panel, replace
******************************************************************************
******************************************************************************




******************************************************************************
******************************************************************************
* Use LEHD job file to create outcomes at the PIK-year level. These will be
*  merged to the PIK-year panel created above.

cd $datadir/intermediate_files
use iris_lehd_jobhistory_2018q4, clear

****** ADD IN UNIVERSITY CARNEGIE CLASS DATA FROM IPEDS*****
merge m:1 year ein using "$datadir/ipeds/uni_carnegie_class.dta", update keepusing(carnegie)
drop if _merge==2
drop _merge

merge m:1 ein using "$datadir/ipeds/uni_carnegie_class_mode_one_per_ein.dta", update keepusing(minmode_carnegie)
drop if _merge==2
drop _merge

replace carnegie = minmode_carnegie if missing(carnegie)
drop minmode_carnegie
replace carnegie = 9999 if carnegie<0

*Was the person working at a umetrics EIN or SEIN
merge m:1 pik year using "$datadir/raw_pulls/umetrics/FSRDC_2018/pik_year_at_umetricsjob.dta", update
gen umetrics_job_year = 1 if _merge>2
drop if _merge==2
drop _merge
***************

keep pik sein spell_u2w es_state seinunit source qtime wage year quarter mode_naics* firmid firmid_edit fas_ein firmage naics*_imp univ_ein carnegie umetrics_job_year
order pik sein seinunit spell_u2w qtime
sort pik sein seinunit spell_u2w qtime

*duplicates tag pik sein spell_u2w seinunit qtime, gen(dup)
*tab dup
*drop dup


* Create indicators for startup, young and incumbent firms
gen start = (firmage <= 0 & univ_ein == 0)
gen young = (firmage > 0 & firmage <= 5 & univ_ein == 0)
gen incum = (firmage > 5 & univ_ein == 0)

* Create High-Tech indicators. 
* Do this for NAICS years 2007, 2012, and 2017.
* Use both the modal seinunit for a given sein-quarter (seinunit_mode) and the seinunit at which
*    the PIK was ACTUALLY EMPLOYED in a given quarter (seinunit_job).

local years 2007 2012 2017
foreach year in `years' {

gen naics4_seinunit_mode = regexs(1) if regexm(mode_naics`year'fnl_emp, "^([0-9][0-9][0-9][0-9])")
gen naics4_seinunit_job = regexs(1) if regexm(naics`year'fnl_imp, "^([0-9][0-9][0-9][0-9])")

local levels "seinunit_mode seinunit_job"
foreach level in `levels' {

gen ht_`level'_`year' = 0
replace ht_`level'_`year' = 1 if naics4_`level' == "1131"
replace ht_`level'_`year' = 1 if naics4_`level' == "1132"
replace ht_`level'_`year' = 1 if naics4_`level' == "2111"
replace ht_`level'_`year' = 1 if naics4_`level' == "2211"
replace ht_`level'_`year' = 1 if naics4_`level' == "3241"
replace ht_`level'_`year' = 1 if naics4_`level' == "3251"
replace ht_`level'_`year' = 1 if naics4_`level' == "3252"
replace ht_`level'_`year' = 1 if naics4_`level' == "3253"
replace ht_`level'_`year' = 1 if naics4_`level' == "3254"
replace ht_`level'_`year' = 1 if naics4_`level' == "3255"
replace ht_`level'_`year' = 1 if naics4_`level' == "3259"
replace ht_`level'_`year' = 1 if naics4_`level' == "3332"
replace ht_`level'_`year' = 1 if naics4_`level' == "3336"
replace ht_`level'_`year' = 1 if naics4_`level' == "3339"
replace ht_`level'_`year' = 1 if naics4_`level' == "3341"
replace ht_`level'_`year' = 1 if naics4_`level' == "3342"
replace ht_`level'_`year' = 1 if naics4_`level' == "3343"
replace ht_`level'_`year' = 1 if naics4_`level' == "3344"
replace ht_`level'_`year' = 1 if naics4_`level' == "3345"
replace ht_`level'_`year' = 1 if naics4_`level' == "3346"
replace ht_`level'_`year' = 1 if naics4_`level' == "3353"
replace ht_`level'_`year' = 1 if naics4_`level' == "3364"
replace ht_`level'_`year' = 1 if naics4_`level' == "3369"
replace ht_`level'_`year' = 1 if naics4_`level' == "4324"
replace ht_`level'_`year' = 1 if naics4_`level' == "4861"
replace ht_`level'_`year' = 1 if naics4_`level' == "4862"
replace ht_`level'_`year' = 1 if naics4_`level' == "4869"
replace ht_`level'_`year' = 1 if naics4_`level' == "5112"
replace ht_`level'_`year' = 1 if naics4_`level' == "5161"
replace ht_`level'_`year' = 1 if naics4_`level' == "5171"
replace ht_`level'_`year' = 1 if naics4_`level' == "5172"
replace ht_`level'_`year' = 1 if naics4_`level' == "5173"
replace ht_`level'_`year' = 1 if naics4_`level' == "5174"
replace ht_`level'_`year' = 1 if naics4_`level' == "5179"
replace ht_`level'_`year' = 1 if naics4_`level' == "5181"
replace ht_`level'_`year' = 1 if naics4_`level' == "5182"
replace ht_`level'_`year' = 1 if naics4_`level' == "5211"
replace ht_`level'_`year' = 1 if naics4_`level' == "5232"
replace ht_`level'_`year' = 1 if naics4_`level' == "5413"
replace ht_`level'_`year' = 1 if naics4_`level' == "5415"
replace ht_`level'_`year' = 1 if naics4_`level' == "5416"
replace ht_`level'_`year' = 1 if naics4_`level' == "5417"
replace ht_`level'_`year' = 1 if naics4_`level' == "5511"
replace ht_`level'_`year' = 1 if naics4_`level' == "5612"
replace ht_`level'_`year' = 1 if naics4_`level' == "8112"

}

drop naics4_seinunit_mode naics4_seinunit_job

}

* The file lehd_sein_any_ht contains additional information on the NAICS codes with
*   which a PIK is affiliated. In particular, it provides the count of SEINUNITS that
*   are high-tech for the SEIN at which teh PIK is employed in a given year. 
*   It also provides the count of SEINUNITS that are high-tech and EVER affiliated with
*   the SEIN at which the PIK is employed.
* This is created using the file: lehd_sein_any_ht.do
cd $datadir/intermediate_files
merge m:1 sein qtime using lehd_sein_any_ht
drop if _merge == 2
drop _merge


drop seinunit_any_count seinunit_any_ever_count

* Create indicator varibles from the counts
local years 2007 2012 2017
foreach year in `years' {

gen ht_seinunit_any_ever_`year' = 1 if ht_seinunit_any_ever_`year'_count > 0
replace ht_seinunit_any_ever_`year' = 0 if missing(ht_seinunit_any_ever_`year')
drop ht_seinunit_any_ever_`year'_count

gen ht_seinunit_any_`year' = 1 if ht_seinunit_any_`year'_count > 0
replace ht_seinunit_any_`year' = 0 if missing(ht_seinunit_any_`year')
drop ht_seinunit_any_`year'_count

}


* Shorten names
local years 2007 2012 2017
foreach year in `years' {

	rename ht_seinunit_mode_`year' ht_mode_`year'
	rename ht_seinunit_job_`year' ht_job_`year'
	rename ht_seinunit_any_`year' ht_any_`year'
	rename ht_seinunit_any_ever_`year' ht_any_ever_`year'
}

keep pik sein seinunit es_state spell_u2w qtime wage year quarter firmid firmid_edit fas_ein source firmage univ_ein start young incum ht_* carnegie umetrics_job_year
order pik sein seinunit spell_u2w qtime wage year quarter firmid firmid_edit fas_ein source firmage univ_ein start young incum ///
      ht_job_* ht_mode_* ht_any_2007 ht_any_2012 ht_any_2017 ht_any_ever_*

label var ht_job_2007 "SEINUNIT of Job High-Tech at Time of Employment (2007 NAICS)"
label var ht_job_2012 "SEINUNIT of Job High-Tech at Time of Employment (2012 NAICS)"
label var ht_job_2017 "SEINUNIT of Job High-Tech at Time of Employment (2017 NAICS)"

label var ht_mode_2007 "Modal SEINUNIT of SEIN High-Tech at Time of Employment (2007 NAICS)"
label var ht_mode_2012 "Modal SEINUNIT of SEIN High-Tech at Time of Employment (2012 NAICS)"
label var ht_mode_2017 "Modal SEINUNIT of SEIN High-Tech at Time of Employment (2017 NAICS)"

label var ht_any_2007 "Any SEINUNIT of SEIN High-Tech at Time of Employment (2007 NAICS)"
label var ht_any_2012 "Any SEINUNIT of SEIN High-Tech at Time of Employment (2012 NAICS)"
label var ht_any_2017 "Any SEINUNIT of SEIN High-Tech at Time of Employment (2017 NAICS)"

label var ht_any_ever_2007 "Any SEINUNIT of SEIN High-Tech at Any Time (2007 NAICS)"
label var ht_any_ever_2012 "Any SEINUNIT of SEIN High-Tech at Any Time (2012 NAICS)"
label var ht_any_ever_2017 "Any SEINUNIT of SEIN High-Tech at Any Time (2017 NAICS)"


* Create high-tech startup, young, and incumbent outcomes.
local types mode job any any_ever
foreach type in `types' {

local years 2007 2012 2017
foreach year in `years' {

gen byte ht_`type'_start_`year' =  (ht_`type'_`year' == 1 & start == 1)
gen byte ht_`type'_young_`year' = (ht_`type'_`year' == 1 & young == 1)
gen byte ht_`type'_incum_`year' = (ht_`type'_`year' == 1 & incum == 1)

}
}

*create a firmage for using in the event study
gen firmage_ht = firmage if ht_mode_2012==1
gen firmage_priv = firmage if univ_ein == 0

gen carnegie_notumetrics = carnegie if umetrics_job_year!=1
gen univ_ein_notumetrics = univ_ein if umetrics_job_year!=1

compress
save lehd_pik_year_outcomes, replace

use lehd_pik_year_outcomes, clear
collapse (max) univ_ein* start young incum ht_* umetrics_job_year (min) firmage_ht firmage_priv carnegie*, by(pik year) fast

compress
save lehd_pik_year_outcomes_collapsed, replace


*Get count to make vars continous
use lehd_pik_year_outcomes, clear
gen firmid_sein = firmid_edit
replace firmid_sein = sein if missing(firmid_sein)
by pik year firmid_sein, sort: gen wage_rank = _n
keep if wage_rank == 1
drop wage_rank
*bys pik year: egen count_ht_mode_start_2012 = sum(ht_mode_start_2012)
collapse (sum) univ_ein* start young incum ht_*, by(pik year) fast
rename * count_*
rename count_pik pik
rename count_year year

merge 1:1 pik year using lehd_pik_year_outcomes_collapsed, update
drop _merge
compress
save lehd_pik_year_outcomes_collapsed, replace

********************
* Dislcosure info


**************
* Get firm identifiers
*Go back to the main file to get wages because the other one is quarterly
cd $datadir/intermediate_files
use iris_lehd_jobhistory_2018q4, clear
gen firmid_sein = firmid_edit
replace firmid_sein = sein if missing(firmid_sein)

keep pik year firmid_sein wage es_state
collapse (sum) wage, by(pik year firmid_sein es_state) fast
duplicates tag pik year firmid_sein, gen(dup)
gen missing_es_state = missing(es_state)
tab dup missing_es_state
drop dup missing_es_state
order pik year wage firmid_sein es_state
gsort pik year -wage firmid_sein es_state

*number of jobs
by pik year firmid_sein, sort: gen wage_rank = _n
keep if wage_rank == 1
drop wage_rank
bys pik year: egen num_jobs = count(wage)

by pik year, sort: gen wage_rank = _n
keep if wage_rank == 1
drop wage_rank

rename wage max_wage
**************


**************
* Get LEHD States
*Go back to the main file to get wages because the other one is quarterly
*cd $datadir/intermediate_files
*use iris_lehd_jobhistory_2018q4, clear
*gen firmid_sein = firmid_edit
*replace firmid_sein = sein if missing(firmid_sein)

*keep pik year firmid_sein wage
*collapse (sum) wage, by(pik year firmid_sein) fast
*bys pik year: egen chosen_wage = max(wage)
*gen chosen_firmid = firmid_sein if chosen_wage==wage

*collapse (max) wage (firstnm) chosen_firmid, by(pik year) fast

*rename wage max_wage
*rename chosen_firmid firmid
**************



merge 1:1 pik year using lehd_pik_year_outcomes_collapsed
drop _merge

compress
save lehd_pik_year_outcomes_collapsed, replace

********************

******************************************************************************
******************************************************************************






******************************************************************************
******************************************************************************

cd $datadir/intermediate_files
use iris_w2_jobhistory_2018q4.dta, clear
drop employeeid
duplicates drop
*duplicates tag pik ein year, gen(dup)
*tab dup
*drop dup
*keep pik ein firmid year wage_tip univ_ein firmage naics2012

****** ADD IN CARNEGIE *****
merge m:1 year ein using "$datadir/ipeds/uni_carnegie_class.dta", update keepusing(carnegie)
drop if _merge==2
drop _merge

merge m:1 ein using "$datadir/ipeds/uni_carnegie_class_mode_one_per_ein.dta", update keepusing(minmode_carnegie)
drop if _merge==2
drop _merge

replace carnegie = minmode_carnegie if missing(carnegie)
drop minmode_carnegie
replace carnegie = 9999 if carnegie<0

*Was the person working at a umetrics EIN or SEIN
merge m:1 pik year using "$datadir/raw_pulls/umetrics/FSRDC_2018/pik_year_at_umetricsjob.dta", update
gen umetrics_job_year = 1 if _merge>2
drop if _merge==2
drop _merge
***************


* Create indicators for startup, young and incumbent firms
gen start = (firmage <= 0 & univ_ein == 0)
gen young = (firmage > 0 & firmage <= 5 & univ_ein == 0)
gen incum = (firmage > 5 & univ_ein == 0) /*& wage_tip>0*/

merge m:1 year using `cpi_2014', update
drop if _merge==2
drop _merge

*Now do keep
keep pik ein firmid year wage_tip univ_ein firmage naics2012 start young incum carnegie umetrics_job_year

* Create High-Tech indicators. 
* Do this for NAICS years 2007, 2012.
* Use both the modal seinunit for a given sein-quarter (seinunit_mode) an
* For W2 I didn't create nacis2007 values, I could if we desided it was imporant.
gen ht_mode_2007 = .

local years 2012
foreach year in `years' {

gen naics4 = regexs(1) if regexm(naics`year', "^([0-9][0-9][0-9][0-9])")


gen ht_mode_`year' = 0
replace ht_mode_`year' = 1 if naics4 == "1131"
replace ht_mode_`year' = 1 if naics4 == "1132"
replace ht_mode_`year' = 1 if naics4 == "2111"
replace ht_mode_`year' = 1 if naics4 == "2211"
replace ht_mode_`year' = 1 if naics4 == "3241"
replace ht_mode_`year' = 1 if naics4 == "3251"
replace ht_mode_`year' = 1 if naics4 == "3252"
replace ht_mode_`year' = 1 if naics4 == "3253"
replace ht_mode_`year' = 1 if naics4 == "3254"
replace ht_mode_`year' = 1 if naics4 == "3255"
replace ht_mode_`year' = 1 if naics4 == "3259"
replace ht_mode_`year' = 1 if naics4 == "3332"
replace ht_mode_`year' = 1 if naics4 == "3336"
replace ht_mode_`year' = 1 if naics4 == "3339"
replace ht_mode_`year' = 1 if naics4 == "3341"
replace ht_mode_`year' = 1 if naics4 == "3342"
replace ht_mode_`year' = 1 if naics4 == "3343"
replace ht_mode_`year' = 1 if naics4 == "3344"
replace ht_mode_`year' = 1 if naics4 == "3345"
replace ht_mode_`year' = 1 if naics4 == "3346"
replace ht_mode_`year' = 1 if naics4 == "3353"
replace ht_mode_`year' = 1 if naics4 == "3364"
replace ht_mode_`year' = 1 if naics4 == "3369"
replace ht_mode_`year' = 1 if naics4 == "4324"
replace ht_mode_`year' = 1 if naics4 == "4861"
replace ht_mode_`year' = 1 if naics4 == "4862"
replace ht_mode_`year' = 1 if naics4 == "4869"
replace ht_mode_`year' = 1 if naics4 == "5112"
replace ht_mode_`year' = 1 if naics4 == "5161"
replace ht_mode_`year' = 1 if naics4 == "5171"
replace ht_mode_`year' = 1 if naics4 == "5172"
replace ht_mode_`year' = 1 if naics4 == "5173"
replace ht_mode_`year' = 1 if naics4 == "5174"
replace ht_mode_`year' = 1 if naics4 == "5179"
replace ht_mode_`year' = 1 if naics4 == "5181"
replace ht_mode_`year' = 1 if naics4 == "5182"
replace ht_mode_`year' = 1 if naics4 == "5211"
replace ht_mode_`year' = 1 if naics4 == "5232"
replace ht_mode_`year' = 1 if naics4 == "5413"
replace ht_mode_`year' = 1 if naics4 == "5415"
replace ht_mode_`year' = 1 if naics4 == "5416"
replace ht_mode_`year' = 1 if naics4 == "5417"
replace ht_mode_`year' = 1 if naics4 == "5511"
replace ht_mode_`year' = 1 if naics4 == "5612"
replace ht_mode_`year' = 1 if naics4 == "8112"

drop naics4

}



keep pik wage_tip year firmid ein firmage univ_ein start young incum ht_* wage_tip carnegie umetrics_job_year
order pik wage_tip year firmid ein firmage univ_ein start young incum ht_*


* Create high-tech startup, young, and incumbent outcomes.
local years 2007 2012
foreach year in `years' {

gen byte ht_mode_start_`year' = (ht_mode_`year' == 1 & start == 1)
gen byte ht_mode_young_`year' = (ht_mode_`year' == 1 & young == 1)
gen byte ht_mode_incum_`year' = (ht_mode_`year' == 1 & incum == 1)

}

*Collaps to get wages
collapse (sum) wage_tip (max) univ_ein start young incum ht_* umetrics_job_year (min) firmage carnegie, by(pik year firmid) fast


compress
save w2_pik_year_outcomes, replace


use w2_pik_year_outcomes, clear

bys pik year: egen chosen_wage = max(wage_tip)
bys pik year: egen num_jobs = count(wage_tip)

gen chosen_firmid = firmid if chosen_wage==wage_tip

*This creates other ways of defining univ

gen alt_univ_ein_max = univ_ein if chosen_wage==wage_tip
replace alt_univ_ein_max = 0 if missing(alt_univ_ein_max)
gen alt_univ_ein_max_notumet = univ_ein if umetrics_job_year!=1 & chosen_wage==wage_tip
replace alt_univ_ein_max_notumet = 0 if missing(alt_univ_ein_max_notumet)
gen alt_carnegie_max = carnegie if chosen_wage==wage_tip
gen alt_carnegie_max_notumet = carnegie if chosen_wage==wage_tip & umetrics_job_year!=1

gen alt_univ_ein_only = univ_ein if num_jobs==1
replace alt_univ_ein_only = 0 if missing(alt_univ_ein_only)

gen alt_univ_ein_1k = univ_ein if wage_tip>1000 & !missing(wage_tip)
replace alt_univ_ein_1k = 0 if missing(alt_univ_ein_1k)

gen alt_univ_ein_4k = univ_ein if wage_tip>4000 & !missing(wage_tip)
replace alt_univ_ein_4k = 0 if missing(alt_univ_ein_4k)

gen alt_univ_ein_1k_or_only = univ_ein if (wage_tip>1000 & !missing(wage_tip)) | num_jobs==1
replace alt_univ_ein_1k_or_only = 0 if missing(alt_univ_ein_1k_or_only)

gen carnegie_notumetrics = carnegie if umetrics_job_year!=1
gen univ_ein_notumetrics = univ_ein if umetrics_job_year!=1

*create a firmage for using in the event study
gen firmage_ht = firmage if ht_mode_2012==1
gen firmage_priv = firmage if univ_ein == 0

compress
save w2_pik_year_outcomes, replace

collapse (max) *univ_ein* start young incum ht_* wage_tip num_jobs (firstnm) chosen_firmid (min) firmage_ht firmage_priv *carnegie*, by(pik year) fast
drop if missing(year)
rename wage_tip max_wage
rename chosen_firmid firmid

compress
save w2_pik_year_outcomes_collapsed, replace

use w2_pik_year_outcomes, clear
drop if missing(year)
collapse (sum) *univ_ein* start young incum ht_* , by(pik year) fast
rename * count_*
rename count_pik pik
rename count_year year

merge 1:1 pik year using w2_pik_year_outcomes_collapsed, update
drop _merge
compress
save w2_pik_year_outcomes_collapsed, replace

******************************************************************************
******************************************************************************




use lehd_pik_year_outcomes_collapsed, clear

rename univ_ein_notumetrics univ_ein_notumet
rename count_univ_ein_notumetrics count_univ_ein_notumet 
renvars univ_ein* start young incum ht_* max_wage firmage_ht firmage_priv firmid es_state carnegie carnegie_notumetrics num_jobs, prefix(lehd_)
drop count_ht_any_ever* /* because these are too long*/
renvars count_* , prefix(lehd_)

drop if year < 2001
drop if year > 2017
drop if missing(year)
merge 1:1 pik year using pik_year_panel
tab flag_pik_source _merge
drop _merge
*mvencode _all, mv(0) override
*I don't want to recode max_wage or firmid of firmage_ht or es_state
mvencode lehd_univ_ein lehd_start lehd_young lehd_incum lehd_ht_* lehd_count_*, mv(0) override
compress
save pik_year_panel_outcomes, replace

* W2 only goes from 2005 to 2017. Check results if limited to this period?
use w2_pik_year_outcomes_collapsed, clear

rename univ_ein_notumetrics univ_ein_notumet
rename count_univ_ein_notumetrics count_univ_ein_notumet 
rename count_alt_univ_ein_max_notumet count_univ_ein_max_notumet
renvars *univ_ein* start young incum ht_* max_wage firmage_ht firmage_priv firmid *carnegie* num_jobs, prefix(w2_)
renvars count_*, prefix(w2_)
*exist_* pay_growth_* // don't add a prefix to these

drop if year < 2001
drop if year > 2017
drop if missing(year)
merge 1:1 pik year using pik_year_panel
tab flag_pik_source _merge
drop _merge
*mvencode _all, mv(0) override
*I don't want to recode max_wage exist_* or pay_growth_* or firmage_ht or firmid
mvencode w2_*univ_ein* w2_start w2_young w2_*incum* w2_ht_* w2_count_*, mv(0) override

merge 1:1 pik year using pik_year_panel_outcomes
drop _merge
compress
tsset pik_num year
save pik_year_panel_outcomes, replace



use pik_year_panel_outcomes, clear

local commonvars univ_ein start young incum ///
                 ht_mode_2012 ///
		 ht_mode_start_2012 ht_mode_young_2012 ht_mode_incum_2012 ///
		 univ_ein_notumet

foreach commonvar in `commonvars' {

	gen `commonvar' = (lehd_`commonvar' == 1 | w2_`commonvar' == 1)
}

foreach commonvar in `commonvars' {

	egen count_`commonvar' = rowmax(lehd_count_`commonvar' w2_count_`commonvar')
}

local univ_vars alt_univ_ein_max alt_univ_ein_only alt_univ_ein_1k alt_univ_ein_4k alt_univ_ein_1k_or_only
foreach univ_var in `univ_vars' {
	gen `univ_var' = (lehd_univ_ein== 1 | w2_`univ_var' == 1)
}
gen alt_univ_ein_max_notumet = (lehd_univ_ein_notumet== 1 | w2_alt_univ_ein_max_notumet == 1)

foreach univ_var in `univ_vars' {
	egen count_`univ_var' = rowmax (lehd_count_univ_ein w2_count_`univ_var')
}
egen count_alt_univ_ein_max_notumet = rowmax(lehd_count_univ_ein_notumet w2_count_univ_ein_max_notumet)

merge m:1 year using `cpi_2014', update
drop if _merge==2
drop _merge


egen max_wage = rowmax(lehd_max_wage w2_max_wage)
gen firmid = lehd_firmid if max_wage==lehd_max_wage
replace firmid = w2_firmid if max_wage==w2_max_wage

replace max_wage = max_wage*cpi_2014_mult
replace lehd_max_wage = lehd_max_wage*cpi_2014_mult

egen firmage_ht = rowmin(lehd_firmage_ht w2_firmage_ht)
egen firmage_priv = rowmin(lehd_firmage_priv w2_firmage_priv)
egen carnegie = rowmin(lehd_carnegie w2_carnegie)
egen carnegie_notumetrics = rowmin(lehd_carnegie_notumetrics w2_carnegie_notumetrics)
egen alt_carnegie_max = rowmin(lehd_carnegie w2_alt_carnegie_max)
egen alt_carnegie_max_notumet = rowmin(lehd_carnegie_notumetrics w2_alt_carnegie_max_notumet)

keep pik_num pik year flag_pik_source ///
     `commonvars' ///
     count_* lehd_count_* ///
     carnegie lehd_carnegie carnegie_notumetrics lehd_carnegie_notumetrics ///
     alt_carnegie_max alt_carnegie_max_notumet ///
     alt_univ_ein_max_notumet ///
     max_wage lehd_max_wage ///
     firmage_ht lehd_firmage_ht firmage_priv lehd_firmage_priv ///
     lehd_univ_ein lehd_start lehd_young lehd_incum ///
     lehd_ht_mode_2012 ///
     lehd_ht_mode_start_2012 lehd_ht_mode_young_2012 lehd_ht_mode_incum_2012 ///
     lehd_univ_ein_notumet ///
     w2_num_jobs lehd_num_jobs ///
     firmid lehd_firmid lehd_es_state ///
     `univ_vars' `incum_vars'
		 
		 
order pik_num
sort pik_num year
tsset pik_num year

order pik pik_num year flag_pik_source ///
      *univ_ein* lehd_univ_ein ///
      start lehd_start ///
      young lehd_young ///
      incum lehd_incum ///
      ht_mode_2012 lehd_ht_mode_2012 ///
      ht_mode_start_2012 lehd_ht_mode_start_2012 ///
      ht_mode_young_2012 lehd_ht_mode_young_2012 ///
      ht_mode_incum_2012 lehd_ht_mode_incum_2012 ///
      max_wage lehd_max_wage

compress
cd $datadir/intermediate_files
save pik_year_panel_outcomes, replace




































*Merge in UMETRICS IDs
cd $datadir/raw_pulls/umetrics/FSRDC_2018
use usiris_emnmxwalk_2018q4a_2018q4a, clear
keep emplid pik storgabbr
drop if missing(pik)
* This needs to by a joinby instead of a merge because there can be multiple PIKs for a given UMETRICS ID.
cd $datadir/intermediate_files
joinby pik using pik_year_panel_outcomes, unmatched(using)
drop _merge
rename emplid employeeid
rename employeeid iris_employee_number

*Umetrics data
merge 1:1 iris_employee_number year using "$datadir/intermediate_files/final_multicfda.dta", update
drop if _merge<3
drop _merge

*Megre in modal cfda codes
merge m:1 iris_employee_number using "$datadir/intermediate_files/cfda_clusters.dta", update
drop if _merge==2
drop _merge


capture noisily egen emp_num = group(iris_employee_number)
tsset emp_num year


compress
save "$datadir/jobhist_lehd_E.dta", replace


*** Sabrina's code for occupations ***
* 12 Occupations
replace umetrics_occupational_class =lower(umetrics_occupational_class )

gen xx=umetrics_occupational_class =="faculty"
replace xx=0 if missing(xx)
bysort pik_num: egen occup_faculty=max(xx)

gen xxx=umetrics_occupational_class =="graduate student"|umetrics_occupational_class =="post graduate research"|umetrics_occupational_class =="research"
replace xxx=0 if missing(xxx)
bysort pik: egen occup_gradpostdoc=max(xxx)

gen xxxx=umetrics_occupational_class =="undergraduate"
replace xxxx=0 if missing(xxxx)
bysort pik: egen occup_undergradstudent=max(xxxx)

gen xxxx2= (umetrics_occupational_class!="" & occup_faculty==0 & occup_gradpostdoc==0 & occup_undergradstudent==0)

bysort pik: egen occup_other_staff=max(xxxx2)
gen occup_tech_other_staff = occup_other_staff



*research facilitation for discloure
gen xxxx3=(umetrics_occupational_class =="research facilitation" & occup_faculty==0 & occup_gradpostdoc==0 & occup_undergradstudent==0)
bysort pik: egen occup_researchfac=max(xxxx3)

drop xx*
sum occup_*


gen log_wage=log(max_wage)
replace log_wage=0 if log_wage==.

tsset emp_num year
compress
save "$datadir/jobhist_lehd_E.dta", replace

